We download the data for AirBnb in Rio

listings <- vroom("http://data.insideairbnb.com/brazil/rj/rio-de-janeiro/2020-06-19/data/listings.csv.gz") %>% 
    clean_names()

Quick description of some of the variables collected, with cost data typically expressed in US$

# We try to concert to numerical variable when it is possible

numeric_listings <- listings %>% 
  mutate(price = parse_number(price), 
         cleaning_fee = parse_number(cleaning_fee),
         extra_people = parse_number(extra_people),
         weekly_price = parse_number(weekly_price),
         monthly_price = parse_number(monthly_price),
         security_deposit = parse_number(security_deposit)
         )

# Exploratory Data Analysis (EDA)

We will analyse the raw values

dplyr::glimpse(listings)
## Rows: 35,731
## Columns: 106
## $ id                                           <dbl> 17878, 21280, 25026, 315…
## $ listing_url                                  <chr> "https://www.airbnb.com/…
## $ scrape_id                                    <dbl> 2.020062e+13, 2.020062e+…
## $ last_scraped                                 <date> 2020-06-19, 2020-06-19,…
## $ name                                         <chr> "Very Nice 2Br in Copaca…
## $ summary                                      <chr> "Discounts for long term…
## $ space                                        <chr> "- Beautiful, sunny 2 be…
## $ description                                  <chr> "Discounts for long term…
## $ experiences_offered                          <chr> "none", "none", "none", …
## $ neighborhood_overview                        <chr> "This is the one of the …
## $ notes                                        <chr> NA, NA, "For any stay su…
## $ transit                                      <chr> "Excellent location. Clo…
## $ access                                       <chr> "The entire apartment is…
## $ interaction                                  <chr> "I will be available thr…
## $ house_rules                                  <chr> "Please leave the apartm…
## $ thumbnail_url                                <lgl> NA, NA, NA, NA, NA, NA, …
## $ medium_url                                   <lgl> NA, NA, NA, NA, NA, NA, …
## $ picture_url                                  <chr> "https://a0.muscache.com…
## $ xl_picture_url                               <lgl> NA, NA, NA, NA, NA, NA, …
## $ host_id                                      <dbl> 68997, 81163, 102840, 13…
## $ host_url                                     <chr> "https://www.airbnb.com/…
## $ host_name                                    <chr> "Matthias", "Jules", "Vi…
## $ host_since                                   <date> 2010-01-08, 2010-02-14,…
## $ host_location                                <chr> "Rio de Janeiro, State o…
## $ host_about                                   <chr> "I  am a  journalist/wri…
## $ host_response_time                           <chr> "within an hour", "withi…
## $ host_response_rate                           <chr> "100%", "100%", "70%", "…
## $ host_acceptance_rate                         <chr> "100%", "90%", "77%", "1…
## $ host_is_superhost                            <lgl> TRUE, FALSE, FALSE, TRUE…
## $ host_thumbnail_url                           <chr> "https://a0.muscache.com…
## $ host_picture_url                             <chr> "https://a0.muscache.com…
## $ host_neighbourhood                           <chr> "Copacabana", "Ipanema",…
## $ host_listings_count                          <dbl> 2, 0, 3, 1, 1, 1, 7, 2, …
## $ host_total_listings_count                    <dbl> 2, 0, 3, 1, 1, 1, 7, 2, …
## $ host_verifications                           <chr> "['email', 'phone', 'rev…
## $ host_has_profile_pic                         <lgl> TRUE, TRUE, TRUE, TRUE, …
## $ host_identity_verified                       <lgl> TRUE, TRUE, TRUE, TRUE, …
## $ street                                       <chr> "Rio de Janeiro, Rio de …
## $ neighbourhood                                <chr> "Copacabana", "Ipanema",…
## $ neighbourhood_cleansed                       <chr> "Copacabana", "Ipanema",…
## $ neighbourhood_group_cleansed                 <lgl> NA, NA, NA, NA, NA, NA, …
## $ city                                         <chr> "Rio de Janeiro", "Rio d…
## $ state                                        <chr> "Rio de Janeiro", "RJ", …
## $ zipcode                                      <chr> "22020-050", "22420-010"…
## $ market                                       <chr> "Rio De Janeiro", "Rio D…
## $ smart_location                               <chr> "Rio de Janeiro, Brazil"…
## $ country_code                                 <chr> "BR", "BR", "BR", "BR", …
## $ country                                      <chr> "Brazil", "Brazil", "Bra…
## $ latitude                                     <dbl> -22.96592, -22.98467, -2…
## $ longitude                                    <dbl> -43.17896, -43.19611, -4…
## $ is_location_exact                            <lgl> TRUE, TRUE, TRUE, TRUE, …
## $ property_type                                <chr> "Condominium", "Apartmen…
## $ room_type                                    <chr> "Entire home/apt", "Enti…
## $ accommodates                                 <dbl> 5, 6, 2, 3, 2, 2, 13, 1,…
## $ bathrooms                                    <dbl> 1.0, 2.0, 1.0, 1.0, 1.5,…
## $ bedrooms                                     <dbl> 2, 2, 1, 1, 1, 1, 6, 1, …
## $ beds                                         <dbl> 2, 4, 2, 1, 1, 1, 6, 1, …
## $ bed_type                                     <chr> "Real Bed", "Real Bed", …
## $ amenities                                    <chr> "{TV,\"Cable TV\",Intern…
## $ square_feet                                  <dbl> NA, NA, NA, NA, NA, 0, N…
## $ price                                        <chr> "$191.00", "$424.00", "$…
## $ weekly_price                                 <chr> NA, "$4,945.00", NA, NA,…
## $ monthly_price                                <chr> NA, "$17,455.00", NA, NA…
## $ security_deposit                             <chr> "$0.00", "$2,647.00", "$…
## $ cleaning_fee                                 <chr> "$250.00", "$265.00", "$…
## $ guests_included                              <dbl> 2, 6, 2, 2, 2, 2, 7, 1, …
## $ extra_people                                 <chr> "$0.00", "$0.00", "$45.0…
## $ minimum_nights                               <dbl> 7, 5, 7, 2, 2, 3, 2, 3, …
## $ maximum_nights                               <dbl> 180, 30, 60, 1125, 89, 2…
## $ minimum_minimum_nights                       <dbl> 7, 5, 7, 2, 2, 3, 2, 3, …
## $ maximum_minimum_nights                       <dbl> 7, 5, 7, 2, 2, 3, 2, 3, …
## $ minimum_maximum_nights                       <dbl> 1125, 30, 60, 1125, 89, …
## $ maximum_maximum_nights                       <dbl> 1125, 30, 60, 1125, 89, …
## $ minimum_nights_avg_ntm                       <dbl> 7.0, 5.0, 7.0, 2.0, 2.0,…
## $ maximum_nights_avg_ntm                       <dbl> 1125, 30, 60, 1125, 89, …
## $ calendar_updated                             <chr> "4 months ago", "4 month…
## $ has_availability                             <lgl> TRUE, TRUE, TRUE, TRUE, …
## $ availability_30                              <dbl> 0, 19, 0, 0, 0, 23, 27, …
## $ availability_60                              <dbl> 6, 49, 0, 0, 0, 52, 54, …
## $ availability_90                              <dbl> 36, 79, 0, 0, 0, 71, 84,…
## $ availability_365                             <dbl> 287, 93, 154, 0, 170, 12…
## $ calendar_last_scraped                        <date> 2020-06-19, 2020-06-19,…
## $ number_of_reviews                            <dbl> 251, 94, 238, 282, 181, …
## $ number_of_reviews_ltm                        <dbl> 20, 6, 7, 25, 23, 33, 22…
## $ first_review                                 <date> 2010-07-15, 2014-02-14,…
## $ last_review                                  <date> 2020-04-06, 2020-03-22,…
## $ review_scores_rating                         <dbl> 93, 97, 94, 96, 94, 98, …
## $ review_scores_accuracy                       <dbl> 9, 10, 9, 10, 10, 10, 10…
## $ review_scores_cleanliness                    <dbl> 10, 10, 9, 10, 9, 10, 9,…
## $ review_scores_checkin                        <dbl> 10, 10, 9, 10, 10, 10, 1…
## $ review_scores_communication                  <dbl> 10, 10, 10, 10, 10, 10, …
## $ review_scores_location                       <dbl> 10, 10, 10, 10, 10, 10, …
## $ review_scores_value                          <dbl> 9, 10, 9, 10, 9, 10, 9, …
## $ requires_license                             <lgl> FALSE, FALSE, FALSE, FAL…
## $ license                                      <lgl> NA, NA, NA, NA, NA, NA, …
## $ jurisdiction_names                           <lgl> NA, NA, NA, NA, NA, NA, …
## $ instant_bookable                             <lgl> TRUE, FALSE, FALSE, TRUE…
## $ is_business_travel_ready                     <lgl> FALSE, FALSE, FALSE, FAL…
## $ cancellation_policy                          <chr> "moderate", "strict_14_w…
## $ require_guest_profile_picture                <lgl> FALSE, FALSE, TRUE, FALS…
## $ require_guest_phone_verification             <lgl> FALSE, FALSE, TRUE, FALS…
## $ calculated_host_listings_count               <dbl> 1, 1, 3, 1, 1, 1, 5, 1, …
## $ calculated_host_listings_count_entire_homes  <dbl> 1, 1, 3, 1, 1, 1, 3, 0, …
## $ calculated_host_listings_count_private_rooms <dbl> 0, 0, 0, 0, 0, 0, 2, 1, …
## $ calculated_host_listings_count_shared_rooms  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ reviews_per_month                            <dbl> 2.08, 1.22, 1.95, 2.33, …

There are 106 columns with 35,731 rows. On all this data, we have 46 columns with character variables, 5 with dates, 16 with logical and 39 columns with numbers.

Computing summary statistics of the variables of interest, or finding NAs

skimr::skim(listings)
Data summary
Name listings
Number of rows 35731
Number of columns 106
_______________________
Column type frequency:
character 46
Date 5
logical 16
numeric 39
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
listing_url 0 1.00 34 37 0 35731 0
name 58 1.00 1 255 0 34488 0
summary 2588 0.93 1 1000 0 31747 0
space 14186 0.60 1 1000 0 20709 0
description 1433 0.96 1 1000 0 33409 0
experiences_offered 0 1.00 4 4 0 1 0
neighborhood_overview 15776 0.56 1 1000 0 18303 0
notes 24127 0.32 1 1000 0 10380 0
transit 15967 0.55 1 1000 0 18237 0
access 19597 0.45 1 1000 0 14325 0
interaction 18251 0.49 1 1000 0 15456 0
house_rules 17304 0.52 1 1000 0 16288 0
picture_url 0 1.00 81 146 0 35146 0
host_url 0 1.00 39 43 0 24992 0
host_name 5 1.00 1 35 0 6429 0
host_location 168 1.00 2 253 0 1180 0
host_about 18793 0.47 1 8586 0 9714 22
host_response_time 5 1.00 3 18 0 5 0
host_response_rate 5 1.00 2 4 0 63 0
host_acceptance_rate 5 1.00 2 4 0 100 0
host_thumbnail_url 5 1.00 55 106 0 24859 0
host_picture_url 5 1.00 57 109 0 24859 0
host_neighbourhood 13638 0.62 3 33 0 164 0
host_verifications 0 1.00 2 158 0 345 0
street 0 1.00 10 112 0 502 0
neighbourhood 1892 0.95 3 24 0 101 0
neighbourhood_cleansed 0 1.00 3 24 0 156 0
city 162 1.00 1 69 0 317 0
state 35 1.00 1 51 0 52 0
zipcode 1752 0.95 1 24 0 4261 1
market 43 1.00 4 21 0 11 0
smart_location 0 1.00 9 77 0 341 0
country_code 0 1.00 2 2 0 1 0
country 0 1.00 6 6 0 1 0
property_type 0 1.00 3 22 0 36 0
room_type 0 1.00 10 15 0 4 0
bed_type 0 1.00 5 13 0 5 0
amenities 0 1.00 2 1267 0 33605 0
price 0 1.00 5 11 0 1117 0
weekly_price 33335 0.07 7 11 0 758 0
monthly_price 33070 0.07 7 11 0 861 0
security_deposit 15680 0.56 5 10 0 737 0
cleaning_fee 11584 0.68 5 9 0 461 0
extra_people 0 1.00 5 9 0 334 0
calendar_updated 0 1.00 5 14 0 90 0
cancellation_policy 0 1.00 6 27 0 6 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
last_scraped 0 1.00 2020-06-19 2020-06-21 2020-06-19 3
host_since 5 1.00 2009-03-29 2020-06-17 2016-01-26 3301
calendar_last_scraped 0 1.00 2020-06-19 2020-06-21 2020-06-19 3
first_review 14991 0.58 2010-06-07 2020-06-18 2018-02-14 2506
last_review 14991 0.58 2012-02-21 2020-06-19 2020-01-31 1529

Variable type: logical

skim_variable n_missing complete_rate mean count
thumbnail_url 35731 0 NaN :
medium_url 35731 0 NaN :
xl_picture_url 35731 0 NaN :
host_is_superhost 5 1 0.13 FAL: 31028, TRU: 4698
host_has_profile_pic 5 1 1.00 TRU: 35588, FAL: 138
host_identity_verified 5 1 0.27 FAL: 26186, TRU: 9540
neighbourhood_group_cleansed 35731 0 NaN :
is_location_exact 0 1 0.69 TRU: 24496, FAL: 11235
has_availability 0 1 1.00 TRU: 35731
requires_license 0 1 0.00 FAL: 35731
license 35731 0 NaN :
jurisdiction_names 35731 0 NaN :
instant_bookable 0 1 0.40 FAL: 21360, TRU: 14371
is_business_travel_ready 0 1 0.00 FAL: 35731
require_guest_profile_picture 0 1 0.02 FAL: 35176, TRU: 555
require_guest_phone_verification 0 1 0.02 FAL: 35177, TRU: 554

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1.00 2.131976e+07 13709734.81 1.787800e+04 1.098372e+07 1.707225e+07 3.428583e+07 4.384991e+07 ▅▇▃▃▆
scrape_id 0 1.00 2.020062e+13 0.00 2.020062e+13 2.020062e+13 2.020062e+13 2.020062e+13 2.020062e+13 ▁▁▇▁▁
host_id 0 1.00 8.502863e+07 89449682.73 1.173900e+04 1.502522e+07 5.623333e+07 1.122780e+08 3.505372e+08 ▇▃▁▁▁
host_listings_count 5 1.00 3.576000e+01 312.28 0.000000e+00 1.000000e+00 1.000000e+00 3.000000e+00 3.316000e+03 ▇▁▁▁▁
host_total_listings_count 5 1.00 3.576000e+01 312.28 0.000000e+00 1.000000e+00 1.000000e+00 3.000000e+00 3.316000e+03 ▇▁▁▁▁
latitude 0 1.00 -2.297000e+01 0.03 -2.307000e+01 -2.298000e+01 -2.297000e+01 -2.295000e+01 -2.275000e+01 ▁▇▃▁▁
longitude 0 1.00 -4.325000e+01 0.10 -4.374000e+01 -4.332000e+01 -4.320000e+01 -4.319000e+01 -4.310000e+01 ▁▁▂▂▇
accommodates 0 1.00 4.180000e+00 2.63 1.000000e+00 2.000000e+00 4.000000e+00 5.000000e+00 1.600000e+02 ▇▁▁▁▁
bathrooms 67 1.00 1.670000e+00 1.05 0.000000e+00 1.000000e+00 1.000000e+00 2.000000e+00 2.000000e+01 ▇▁▁▁▁
bedrooms 79 1.00 1.620000e+00 1.09 0.000000e+00 1.000000e+00 1.000000e+00 2.000000e+00 3.000000e+01 ▇▁▁▁▁
beds 331 0.99 2.540000e+00 2.08 0.000000e+00 1.000000e+00 2.000000e+00 3.000000e+00 6.200000e+01 ▇▁▁▁▁
square_feet 35279 0.01 5.252900e+02 983.16 0.000000e+00 0.000000e+00 1.940000e+02 7.530000e+02 1.076400e+04 ▇▁▁▁▁
guests_included 0 1.00 1.730000e+00 1.60 1.000000e+00 1.000000e+00 1.000000e+00 2.000000e+00 3.600000e+01 ▇▁▁▁▁
minimum_nights 0 1.00 4.910000e+00 22.27 1.000000e+00 1.000000e+00 2.000000e+00 4.000000e+00 1.123000e+03 ▇▁▁▁▁
maximum_nights 0 1.00 2.865507e+04 5290264.02 1.000000e+00 3.000000e+01 1.125000e+03 1.125000e+03 1.000000e+09 ▇▁▁▁▁
minimum_minimum_nights 0 1.00 4.780000e+00 21.18 1.000000e+00 1.000000e+00 2.000000e+00 4.000000e+00 1.095000e+03 ▇▁▁▁▁
maximum_minimum_nights 0 1.00 5.120000e+00 21.42 1.000000e+00 1.000000e+00 3.000000e+00 5.000000e+00 1.095000e+03 ▇▁▁▁▁
minimum_maximum_nights 0 1.00 2.873711e+04 5290263.58 1.000000e+00 4.000000e+01 1.125000e+03 1.125000e+03 1.000000e+09 ▇▁▁▁▁
maximum_maximum_nights 0 1.00 2.874036e+04 5290263.56 1.000000e+00 4.400000e+01 1.125000e+03 1.125000e+03 1.000000e+09 ▇▁▁▁▁
minimum_nights_avg_ntm 0 1.00 4.890000e+00 21.29 1.000000e+00 1.000000e+00 2.100000e+00 4.000000e+00 1.095000e+03 ▇▁▁▁▁
maximum_nights_avg_ntm 0 1.00 2.873929e+04 5290263.57 1.000000e+00 4.350000e+01 1.125000e+03 1.125000e+03 1.000000e+09 ▇▁▁▁▁
availability_30 0 1.00 1.654000e+01 13.92 0.000000e+00 0.000000e+00 2.700000e+01 3.000000e+01 3.000000e+01 ▆▁▁▁▇
availability_60 0 1.00 3.439000e+01 27.90 0.000000e+00 0.000000e+00 5.400000e+01 5.900000e+01 6.000000e+01 ▆▁▁▁▇
availability_90 0 1.00 5.255000e+01 41.77 0.000000e+00 0.000000e+00 8.300000e+01 8.900000e+01 9.000000e+01 ▆▁▁▁▇
availability_365 0 1.00 1.723200e+02 154.74 0.000000e+00 0.000000e+00 1.680000e+02 3.620000e+02 3.650000e+02 ▇▃▂▁▇
number_of_reviews 0 1.00 9.630000e+00 25.64 0.000000e+00 0.000000e+00 1.000000e+00 6.000000e+00 4.060000e+02 ▇▁▁▁▁
number_of_reviews_ltm 0 1.00 3.060000e+00 6.93 0.000000e+00 0.000000e+00 0.000000e+00 2.000000e+00 1.050000e+02 ▇▁▁▁▁
review_scores_rating 15873 0.56 9.458000e+01 9.58 2.000000e+01 9.300000e+01 9.800000e+01 1.000000e+02 1.000000e+02 ▁▁▁▁▇
review_scores_accuracy 15892 0.56 9.640000e+00 0.92 2.000000e+00 1.000000e+01 1.000000e+01 1.000000e+01 1.000000e+01 ▁▁▁▁▇
review_scores_cleanliness 15887 0.56 9.390000e+00 1.13 2.000000e+00 9.000000e+00 1.000000e+01 1.000000e+01 1.000000e+01 ▁▁▁▁▇
review_scores_checkin 15893 0.56 9.810000e+00 0.72 2.000000e+00 1.000000e+01 1.000000e+01 1.000000e+01 1.000000e+01 ▁▁▁▁▇
review_scores_communication 15885 0.56 9.760000e+00 0.81 2.000000e+00 1.000000e+01 1.000000e+01 1.000000e+01 1.000000e+01 ▁▁▁▁▇
review_scores_location 15891 0.56 9.750000e+00 0.74 2.000000e+00 1.000000e+01 1.000000e+01 1.000000e+01 1.000000e+01 ▁▁▁▁▇
review_scores_value 15889 0.56 9.290000e+00 1.06 2.000000e+00 9.000000e+00 1.000000e+01 1.000000e+01 1.000000e+01 ▁▁▁▁▇
calculated_host_listings_count 0 1.00 8.070000e+00 34.23 1.000000e+00 1.000000e+00 1.000000e+00 3.000000e+00 3.190000e+02 ▇▁▁▁▁
calculated_host_listings_count_entire_homes 0 1.00 7.200000e+00 33.90 0.000000e+00 1.000000e+00 1.000000e+00 2.000000e+00 3.170000e+02 ▇▁▁▁▁
calculated_host_listings_count_private_rooms 0 1.00 7.500000e-01 1.59 0.000000e+00 0.000000e+00 0.000000e+00 1.000000e+00 2.000000e+01 ▇▁▁▁▁
calculated_host_listings_count_shared_rooms 0 1.00 9.000000e-02 0.65 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 1.100000e+01 ▇▁▁▁▁
reviews_per_month 14991 0.58 5.500000e-01 0.73 1.000000e-02 1.100000e-01 2.600000e-01 6.800000e-01 8.550000e+00 ▇▁▁▁▁

Below we analyze which variables are numbers

numeric_listings <- numeric_listings %>% 
  select(price, 
         weekly_price,
         monthly_price,
         security_deposit,
         cleaning_fee,
         guests_included,
         extra_people,
         property_type,
         room_type,
         accommodates,
         bathrooms,
         bedrooms,
         beds,
         square_feet,
         neighbourhood,
         neighbourhood_cleansed,
         neighbourhood_group_cleansed,
         is_location_exact,
         latitude,
         longitude,
         number_of_reviews,
         number_of_reviews_ltm,
         review_scores_rating,
         minimum_nights,
         maximum_nights,
         is_business_travel_ready,
         cancellation_policy,
         host_is_superhost,
         host_identity_verified,
         listing_url
         )

Computing Summary Statistics

We find the missing values after having computed summary statistics of the variable of interest

numeric_listings <- numeric_listings%>%
  mutate(neighbourhood = as.factor(neighbourhood),
         neighbourhood_cleansed = as.factor(neighbourhood_cleansed),
         neighbourhood_group_cleansed = as.factor(neighbourhood_group_cleansed),
         room_type = as.factor(room_type),
         cancellation_policy = as.factor(cancellation_policy),
         host_is_superhost = as.factor(host_is_superhost))
summary(numeric_listings)
##      price           weekly_price    monthly_price    security_deposit
##  Min.   :     0.0   Min.   :   110   Min.   :   360   Min.   :    0   
##  1st Qu.:   155.0   1st Qu.:  1120   1st Qu.:  3444   1st Qu.:    0   
##  Median :   300.0   Median :  2000   Median :  6000   Median :  500   
##  Mean   :   785.9   Mean   :  4361   Mean   : 16609   Mean   : 1191   
##  3rd Qu.:   642.0   3rd Qu.:  4818   3rd Qu.: 16000   3rd Qu.: 1000   
##  Max.   :132358.0   Max.   :133832   Max.   :582374   Max.   :79415   
##                     NA's   :33335    NA's   :33070    NA's   :15680   
##   cleaning_fee    guests_included   extra_people     property_type     
##  Min.   :   0.0   Min.   : 1.000   Min.   :   0.00   Length:35731      
##  1st Qu.:  89.0   1st Qu.: 1.000   1st Qu.:   0.00   Class :character  
##  Median : 150.0   Median : 1.000   Median :   0.00   Mode  :character  
##  Mean   : 164.3   Mean   : 1.728   Mean   :  42.35                     
##  3rd Qu.: 200.0   3rd Qu.: 2.000   3rd Qu.:  50.00                     
##  Max.   :7412.0   Max.   :36.000   Max.   :5552.00                     
##  NA's   :11584                                                         
##            room_type      accommodates       bathrooms         bedrooms     
##  Entire home/apt:25638   Min.   :  1.000   Min.   : 0.000   Min.   : 0.000  
##  Hotel room     :  173   1st Qu.:  2.000   1st Qu.: 1.000   1st Qu.: 1.000  
##  Private room   : 9143   Median :  4.000   Median : 1.000   Median : 1.000  
##  Shared room    :  777   Mean   :  4.184   Mean   : 1.671   Mean   : 1.623  
##                          3rd Qu.:  5.000   3rd Qu.: 2.000   3rd Qu.: 2.000  
##                          Max.   :160.000   Max.   :20.000   Max.   :30.000  
##                                            NA's   :67       NA's   :79      
##       beds         square_feet                       neighbourhood  
##  Min.   : 0.000   Min.   :    0.0   Copacabana              : 9262  
##  1st Qu.: 1.000   1st Qu.:    0.0   Barra da Tijuca         : 6037  
##  Median : 2.000   Median :  194.0   Ipanema                 : 3069  
##  Mean   : 2.544   Mean   :  525.3   Recreio dos Bandeirantes: 1832  
##  3rd Qu.: 3.000   3rd Qu.:  753.0   Botafogo                : 1765  
##  Max.   :62.000   Max.   :10764.0   (Other)                 :11874  
##  NA's   :331      NA's   :35279     NA's                    : 1892  
##               neighbourhood_cleansed neighbourhood_group_cleansed
##  Copacabana              : 9293      NA's:35731                  
##  Barra da Tijuca         : 4023                                  
##  Ipanema                 : 3024                                  
##  Jacarepaguá             : 2030                                  
##  Botafogo                : 1762                                  
##  Recreio dos Bandeirantes: 1685                                  
##  (Other)                 :13914                                  
##  is_location_exact    latitude        longitude      number_of_reviews
##  Mode :logical     Min.   :-23.07   Min.   :-43.74   Min.   :  0.000  
##  FALSE:11235       1st Qu.:-22.98   1st Qu.:-43.32   1st Qu.:  0.000  
##  TRUE :24496       Median :-22.97   Median :-43.20   Median :  1.000  
##                    Mean   :-22.97   Mean   :-43.25   Mean   :  9.634  
##                    3rd Qu.:-22.95   3rd Qu.:-43.19   3rd Qu.:  6.000  
##                    Max.   :-22.75   Max.   :-43.10   Max.   :406.000  
##                                                                       
##  number_of_reviews_ltm review_scores_rating minimum_nights    
##  Min.   :  0.00        Min.   : 20.00       Min.   :   1.000  
##  1st Qu.:  0.00        1st Qu.: 93.00       1st Qu.:   1.000  
##  Median :  0.00        Median : 98.00       Median :   2.000  
##  Mean   :  3.06        Mean   : 94.58       Mean   :   4.907  
##  3rd Qu.:  2.00        3rd Qu.:100.00       3rd Qu.:   4.000  
##  Max.   :105.00        Max.   :100.00       Max.   :1123.000  
##                        NA's   :15873                          
##  maximum_nights      is_business_travel_ready
##  Min.   :        1   Mode :logical           
##  1st Qu.:       30   FALSE:35731             
##  Median :     1125                           
##  Mean   :    28655                           
##  3rd Qu.:     1125                           
##  Max.   :999999999                           
##                                              
##                   cancellation_policy host_is_superhost host_identity_verified
##  flexible                   :14893    FALSE:31028       Mode :logical         
##  moderate                   : 6418    TRUE : 4698       FALSE:26186           
##  strict                     :    3    NA's :    5       TRUE :9540            
##  strict_14_with_grace_period:14191                      NA's :5               
##  super_strict_30            :   88                                            
##  super_strict_60            :  138                                            
##                                                                               
##  listing_url       
##  Length:35731      
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 
# Analyzing variable of interest price on its own and when computed and combined with other variables

# Price ?
favstats(~price, data = numeric_listings)
minQ1medianQ3maxmeansdnmissing
01553006421.32e+057862.38e+03357310
favstats(~weekly_price, data = numeric_listings)
minQ1medianQ3maxmeansdnmissing
1101.12e+032e+034.82e+031.34e+054.36e+037.58e+03239633335
# Price per value of certain categorical variables
favstats(price~property_type, data = numeric_listings)
property_typeminQ1medianQ3maxmeansdnmissing
Aparthotel70      142       184       392       2.01e+03362       401       300
Apartment0      169       302       672       5.35e+04746       1.88e+03274500
Barn529      529       529       529       529       529              10
Bed and breakfast0      91       139       298       1.32e+052.33e+031.17e+041590
Boat80      1.59e+033e+03       6.5e+03 4e+04       5.52e+038.56e+03210
Boutique hotel53      294       691       5e+03       1.2e+04 3.22e+033.95e+03350
Bungalow37      112       150       321       1e+03       270       293       90
Cabin70      416       1e+03       1.39e+033.9e+03 1.18e+031.13e+03100
Camper/RV252      252       252       252       252       252              10
Campsite101      476       850       1.22e+031.6e+03 850       1.06e+0320
Casa particular (Cuba)37      54.8     88.5     114       2.06e+03408       813       60
Castle300      318       336       353       371       336       50.2     20
Chalet54      107       163       258       578       199       133       320
Condominium37      150       284       498       5.35e+04532       1.77e+0320780
Cottage102      224       362       752       6e+03       837       1.34e+03380
Earth house37      83       118       289       3.75e+03317       638       350
Farm stay64      2.27e+034.47e+036.68e+038.88e+034.47e+036.23e+0320
Guest suite0      104       150       249       5.95e+03264       565       2270
Guesthouse0      96       142       246       1.6e+03 213       248       1380
Hostel37      48       72.5     131       2.5e+04 1.67e+036e+03       980
Hotel53      187       336       584       2.5e+04 1.31e+033.56e+031140
House0      118       268       854       5.35e+041.19e+033.5e+03 35630
Houseboat252      1.88e+033.5e+03 1.67e+043e+04       1.13e+041.63e+0430
Hut79      79       79       79       79       79              10
Island150      196       256       400       701       341       248       40
Loft37      139       201       344       1e+04       438       860       7090
Nature lodge54      70       79       101       4.28e+03917       1.88e+0350
Other37      102       249       600       1.07e+04710       1.43e+03770
Serviced apartment53      182       270       424       1.89e+04423       860       6550
Tent43      99       155       578       1e+03       400       524       30
Tiny house37      86       160       252       1.62e+03241       287       420
Townhouse43      97.2     150       260       2.81e+03281       439       620
Treehouse225      370       514       658       803       514       409       20
Vacation home2.5e+032.5e+03 2.5e+03 2.5e+03 2.5e+03 2.5e+03        10
Villa43      150       450       2.36e+031.32e+053.78e+031.37e+041150
Yurt171      171       171       171       171       171              10
favstats(price~neighbourhood, data = numeric_listings)
neighbourhoodminQ1medianQ3maxmeansdnmissing
Abolição64150  200      424       1e+03       349       352       60
Anchieta101438  699      1.42e+034.82e+031.33e+031.68e+0370
Andaraí4280  198      600       1.7e+04 746       2.05e+03850
Barra da Tijuca0225  450      1e+03       5.35e+041.11e+032.81e+0360370
Barra de Guaratiba43187  300      600       4.82e+041.09e+035.16e+03870
Barros Filho7078  86      94       102       86       22.6     20
Benfica43101  402      1e+03       3e+03       820       1.03e+0390
Bento Ribeiro64118  177      455       1.2e+03 319       300       150
Bonsucesso4359  72.5    114       1.5e+03 152       304       220
Botafogo37127  219      450       3.5e+04 442       1.33e+0317650
Brás de Pina3764  80      102       498       108       96.2     270
Cachambi32102  198      391       1.06e+03289       242       410
Caju54105  134      408       1.2e+03 380       547       40
Cascadura4896  150      498       3.55e+042.38e+038.54e+03170
Catete37128  201      448       8.89e+03892       2.09e+033430
Catumbi43101  402      600       669       363       284       50
Cavalcante348348  348      348       348       348              10
Centro37101  150      300       2.14e+04665       1.91e+035730
Cidade Nova59102  150      314       3e+03       331       559       280
Coelho Neto3742.548      131       214       99.7     99.2     30
Colégio7072.274.5    76.8     79       74.5     6.36    20
Complexo da Maré150150  150      150       150       150              10
Complexo de Alemão37178  318      459       600       318       398       20
Copacabana0159  252      498       1.32e+05657       2.31e+0392620
Cordovil4860  136      250       371       173       151       40
Cosme Velho37143  300      498       2e+04       573       1.88e+031150
Del Castilho4885  209      440       1e+03       283       240       230
Deodoro201339  476      614       752       476       390       20
Encantado70112  166      257       3.18e+03356       679       210
Engenheiro Leal102451  800      1.15e+031.5e+03 800       988       20
Engenho da Rainha53105  276      466       1.61e+03420       507       80
Engenho de Dentro43153  389      598       2.14e+03480       431       800
Engenho Novo4370  145      400       2e+03       309       399       510
Estacio3780  116      209       1.8e+03 257       371       890
Flamengo37127  225      450       3.5e+04 617       1.75e+039410
Gamboa37118  198      498       8.89e+03977       2.17e+03330
Gávea54223  388      916       1.07e+04883       1.39e+032980
Glória42111  180      349       3e+04       452       1.8e+03 3490
Grajaú37102  186      408       8e+03       479       1.08e+03960
Grumari91194  296      598       899       429       420       30
Guadalupe90155  268      498       1e+03       361       271       130
Higienópolis48269  354      480       1.5e+03 436       403       100
Honório Gurgel6477.591      104       118       91       38.2     20
Humaitá42176  302      596       1.5e+04 544       1.01e+032940
Inhaúma5473.5193      326       798       267       262       80
Ipanema32238  402      799       4.57e+04895       2.05e+0330690
Irajá4359  214      382       1.33e+03316       343       360
Jacaré8092  107      148       236       132       70.7     40
Jardim América7070  70      70       70       70              10
Jardim Botânico37184  359      799       1.5e+04 891       1.7e+03 3270
Joá69261  1.5e+034.05e+032.68e+044.01e+036.73e+031000
Lagoa54252  498      1.1e+03 3.14e+041.08e+032.17e+033620
Lapa32102  171      351       1.07e+04674       1.76e+035120
Laranjeiras37122  229      498       5.35e+04520       2.07e+037320
Leblon48286  450      872       2.5e+04 884       1.48e+0316120
Leme37169  296      582       3.5e+04 611       1.73e+036560
Lins de Vasconcelos4888.2141      243       699       213       191       160
Madureira59114  161      618       1.23e+03377       381       190
Mangueira3742  58      79       90       61.2     23       50
Manguinhos86240  394      547       701       394       435       20
Maracanã43115  332      1e+03       1.06e+04717       1.05e+032790
Marechal Hermes5990  134      241       1.8e+03 284       414       190
Maria da Graça3784.8160      377       1.78e+03398       552       140
Méier43102  201      300       2.12e+03277       304       570
Olaria4880  98.5    306       1.34e+03267       393       100
Oswaldo Cruz69169  401      498       2.14e+03528       552       130
Parada de Lucas6470  80      118       701       207       277       50
Parque Anchieta5469.5150      252       2.14e+03381       632       110
Parque Colúmbia198198  198      198       198       198              10
Pavuna134215  300      384       1.07e+03401       342       60
Penha4370  91      198       1.5e+03 193       311       210
Penha Circular79115  184      244       252       175       85.3     40
Piedade101124  170      535       857       310       241       160
Pilares3764  91      375       584       210       211       90
Praça da Bandeira48102  198      628       5.35e+03471       711       760
Quintino Bocaiúva3788  236      750       2.65e+03615       832       110
Ramos4867.2185      399       2e+04       2.5e+03 5.96e+03120
Recreio dos Bandeirantes32193  353      900       3.76e+04822       1.64e+0318320
Riachuelo53139  428      648       1.61e+03468       406       250
Ricardo de Albuquerque118357  800      1.77e+033.21e+031.21e+031.19e+0360
Rio Comprido42102  187      498       2.89e+03370       453       1230
Rocha43110  300      668       2.12e+03463       492       590
Rocha Miranda101200  300      549       798       400       359       30
Rocinha4871.2107      626       2.38e+03438       616       220
Sampaio5386  112      321       1.2e+03 260       307       170
Santa Teresa32102  182      344       5.29e+04575       2.52e+0311070
Santo Cristo3787.5225      406       8.89e+031.2e+03 2.6e+03 120
São Conrado43219  498      1.75e+035.57e+042.17e+035.57e+032480
São Cristóvão37102  150      434       2.9e+03 379       543       1230
Saúde4898.5246      646       3.65e+042.91e+037e+03       310
Tijuca32102  198      551       3.3e+04 546       1.52e+037220
Todos os Santos43104  198      428       5.35e+03523       969       390
Tomás Coelho6470  90      123       1.07e+03230       372       70
Urca54160  289      574       1e+04       720       1.25e+031420
Vaz Lobo300650  1e+03      1.17e+031.34e+03880       530       30
Vidigal37102  198      404       2.5e+04 1.56e+035.01e+031760
Vigário Geral5982.5106      200       294       153       124       30
Vila da Penha5996.5150      310       1.1e+03 239       260       150
Vila Isabel32102  249      687       1.12e+04567       944       3100
Vila Kosmos6464  72      82.5     90       74.5     12.8     40
Vincente de Carvalho3751  900      1.6e+03 1.78e+03861       811       70
favstats(price~room_type, data = numeric_listings)
room_typeminQ1medianQ3maxmeansdnmissing
Entire home/apt02013918031.32e+059572.54e+03256380
Hotel room0961853971e+04       9762.23e+031730
Private room01011502961.32e+053281.79e+0391430
Shared room32591012012.5e+04 4772.39e+037770
# Summary statistics for numeric variables that might influence price
favstats(~cleaning_fee, data = numeric_listings) 
minQ1medianQ3maxmeansdnmissing
0891502007.41e+031641942414711584
favstats(~review_scores_rating, data = numeric_listings) 
minQ1medianQ3maxmeansdnmissing
20939810010094.69.581985815873

For the summary, we can conclude: 1. Price range is around 0 - 132358 USD, with average value 785.9 USD

  1. There are 4 types of room: Entire home/apt, hotel room, private room and shared room. What’s more, entire home/apt represents a significant portion of the total populations of accommodations on Airbnb

  2. The top 5 most popular neighbourhoods in Rio are Copacabana, Barra da Tijuca, Ipanema and Jacarepaguá.

  3. There are 36 different property types but 77% of the accommodations are apartments.

  4. Most of the hosts (87%) in Rio are not super host.

numeric_only_listings <- numeric_listings %>% 
  # Analyzing the numerical factors of the dataset
 select(
        host_is_superhost,
        accommodates,
        bedrooms,
        number_of_reviews,
        square_feet,
        price,
        weekly_price,
        cleaning_fee,
        extra_people,
        minimum_nights,
        maximum_nights,
        number_of_reviews,
        review_scores_rating
        )
# We analyze the correlation by plotting a matrix
numeric_only_listings %>%
  ggcorr( palette = "BlGr", label = TRUE, label_size=3, hjust=1,size=3) +
  theme(legend.title = element_text(size = 14))

# It looks like we can reduce the number of variables for the Scatterplot Matrix (corr >=0.2)
numeric_listings %>% 
  select(price,
         square_feet,
         weekly_price,
         number_of_reviews,
         review_scores_rating,
         extra_people
         ) %>%
ggpairs(axisLabels = "show")

As the color graph above displays, we can see that most variables have a weak (if any) correlation.

However, there are certain variables which exhibit a significant stastical correlation, such as Bedrooms & the number of people being accomodated, as well as price & weekly price. This makes sense, as the more bedrooms there are, the more one will be charged on average.

listings <- listings %>% 
  mutate(price = parse_number(price))

We also noticed that there are many missing values in the data set. There are 35731 NAs for neighbourhood_group_cleansed, 35279 NAs for square_feet, 33070 NAs for monthly_price, 33335 NAs for weekly_price, 15680 NAs for security_deposit, 11584 NAs for cleaning_fee, 15873 NAs for review_scores_rating, 79 NAs for beds, 79 NAs for bedrooms, 67 NAs for bathrooms, 5 NAs for host_is_superhost and 5 NAs for host_identity_verified.

Because NAs is the property not having these information, we will clean NAs equal to zeros.

# cleaning_fee values that are NAs equal to zero

numeric_listings <- numeric_listings %>%
  mutate(cleaning_fee = case_when(
    is.na(cleaning_fee) ~ 0, 
    TRUE ~ cleaning_fee)
    )
# property_type is a new variable we are creating which contains 5 different categories which includes the top 4 and others
numeric_listings <- numeric_listings %>%
  mutate(prop_type_simplified = case_when(
    property_type %in% c("Apartment","Condominium", "House","Loft") ~ property_type,
    TRUE ~ "Other"
  ))

The most common value for minimum nights is 1 night. The intended purpose is to provide flexibility to guests.

# minimum_nights values displayed

min_nights <- numeric_listings %>% 
  count(minimum_nights) %>% 
  arrange(desc(n))

# minimum_nights displayed in plot
ggplot(numeric_listings, aes(x=minimum_nights)) +
 geom_histogram(bins=10) + 
 labs(y="Count",x="Minimum nights",title="Frequency of minimum_nights") +
 xlim(0,50)+
  theme_bw()

Filter the airbnb data so that it only includes observations with minimum_nights <= 4

# Filter the data so that it only includes observations with minimum_nights <= 4
numeric_listings <- numeric_listings %>% filter(minimum_nights<=4)

Mapping

Moving on with our filtered data set, we want to have a mapping of Airbnb listings on Rio.

As seen below, the most properties are located along the waterfront. To no suprise, the further inland one goes the less properties are shown, and naturally there are no properties in the national park.

leaflet(data = filter(numeric_listings)) %>% 
  addProviderTiles("OpenStreetMap.Mapnik") %>% 
  addCircleMarkers(lng = ~longitude, 
                   lat = ~latitude, 
                   radius = 1, 
                   fillColor = "blue", 
                   fillOpacity = 0.4, 
                   popup = ~listing_url,
                   label = ~property_type)

##Regression Analysis

# Create a new variable called price_4_nights

numeric_listings <- numeric_listings %>% 
  mutate(price_4_nights=case_when(
    guests_included==1 ~ 4*price+cleaning_fee+extra_people,
    TRUE ~ 4*price+cleaning_fee))

# density plots to examine the distributions of price_4_nights
numeric_listings %>% ggplot(aes(x=price_4_nights)) +
  geom_histogram() +
  xlim(0,15000)

# density plots to examine the distributions of log(price_4_nights)
numeric_listings %>% ggplot(aes(x=log(price_4_nights))) +
  geom_histogram() +
  xlim(0,12) +
  NULL

We should use log(price_4_nights) as the variable for the regression analysis as it is easier to be precise and to analyze the data than we the raw price_4_nights

Regression model called model1 with: prop_type_simplified, number_of_reviews, and review_scores_rating

# Creating the regression model
model1 <- lm(log(0.01+(price_4_nights)) ~ 
               prop_type_simplified + 
               number_of_reviews + 
               review_scores_rating,
             data = numeric_listings)

# Display the coefficients
model1 %>% tidy() %>% mutate(p.value=round(p.value,3), estimate=round(estimate,3))
termestimatestd.errorstatisticp.value
(Intercept)6.97 0.0615  113    0    
prop_type_simplifiedCondominium0.0150.0246  0.6050.545
prop_type_simplifiedHouse-0.2710.0232  -11.7  0    
prop_type_simplifiedLoft-0.1320.0396  -3.33 0.001
prop_type_simplifiedOther-0.1590.0258  -6.15 0    
number_of_reviews-0.0030.000186-13.9  0    
review_scores_rating0.0010.0006461.51 0.13 
# R squared value ?
model1 %>% glance()
r.squaredadj.r.squaredsigmastatisticp.valuedflogLikAICBICdeviancedf.residualnobs
0.02040.020.80358.82.23e-726-2.04e+044.08e+044.08e+041.1e+041697116978

The review_scores_rating coefficient equals to 0.001. Having performed a logarithmic transformation of the dependent variable but not of the independent variables, we can interpret the effect of the coefficients as follows: we take the exponent of the estimate to remove the logarithm, subtract 1 from this result, and then multiply by 100 to obtain the percentage change in the dependent variable following a unit increase in the independent variable.

Thus, for each increase in the variable, the price will increase by [exp(0.001) - 1] * 100 = ~0.1%.

Moving on to property type, we note that the base category in this model is apartments. Hence, the coefficients for the other four property types can be interpreted as follows: - Condominium - coefficient: 0.015. Hence, if the the property is a condo, it adds ~1.51% to the price. - House - coefficient: -0.271. So, an Airbnb of this type has a price that is ~23.7% lower than an apartment. - Loft - coefficient: -0.132. This says that a loft is ~12.4% less expensive than an apartment. - Other - coefficient: -0.147. From this we see that any other property type besides a condo, house or loft costs ~14.7% more compared to an apartment.

In conclusion, we can see that the type of property having the most effect on our model is the “Other” type of property. This could be due to the fact that exotic types of property are more expensive than flats.

Overall, “model 1” has an adjusted R-squared value of 0.020. This means that <2% of the variance is described by this model. This is a very poor performance…

# Creating the regression model
model2 <- lm(log(0.01+(price_4_nights)) ~ 
               prop_type_simplified + 
               number_of_reviews + 
               review_scores_rating +
               room_type,
             data = numeric_listings)

# View the coefficients
model2 %>% tidy() %>% mutate(p.value=round(p.value,3), estimate=round(estimate,3))
termestimatestd.errorstatisticp.value
(Intercept)7.05 0.055   128    0    
prop_type_simplifiedCondominium0.0290.0219  1.33 0.184
prop_type_simplifiedHouse0.0620.0214  2.91 0.004
prop_type_simplifiedLoft-0.2620.0354  -7.39 0    
prop_type_simplifiedOther0.0060.0244  0.2520.801
number_of_reviews-0.0030.000167-20.7  0    
review_scores_rating0.0030.0005784.45 0    
room_typeHotel room-0.5570.0703  -7.92 0    
room_typePrivate room-0.8060.0128  -62.8  0    
room_typeShared room-1.1  0.042   -26.2  0    
# check the R squared value
model2 %>% glance()
r.squaredadj.r.squaredsigmastatisticp.valuedflogLikAICBICdeviancedf.residualnobs
0.2190.2190.71752909-1.84e+043.69e+043.7e+048.73e+031696816978

We can see that room_type has not improved much the models with an adjusted R squared value to ~0.22.This is a huge improvements from the previous model! Bathrooms, bedrooms, beds, or size of the house (accomodates) significant predictors of price_4_nights?

# Bathrooms, bedrooms, beds and accommodates into model2
model3_withbed <- lm(log(0.01+(price_4_nights)) ~ 
               prop_type_simplified + 
               number_of_reviews + 
               review_scores_rating +
               room_type +
               bedrooms +
               bathrooms +
               beds +
               accommodates,
             data = numeric_listings)

# View the coefficients
m3_sig_factors_withbed <- model3_withbed %>% tidy() %>% mutate(p.value=round(p.value,3), estimate=round(estimate,3))
m3_sig_factors_withbed
termestimatestd.errorstatisticp.value
(Intercept)6.36 0.0498  128    0    
prop_type_simplifiedCondominium-0.0490.0194  -2.52 0.012
prop_type_simplifiedHouse-0.1510.0192  -7.87 0    
prop_type_simplifiedLoft0.0240.0316  0.7480.455
prop_type_simplifiedOther0.0270.0215  1.25 0.213
number_of_reviews-0.0030.000147-18.7  0    
review_scores_rating0.0020.0005114.09 0    
room_typeHotel room-0.3560.062   -5.74 0    
room_typePrivate room-0.5220.0126  -41.5  0    
room_typeShared room-0.8120.038   -21.3  0    
bedrooms0.2160.00827 26.1  0    
bathrooms0.1510.00798 19    0    
beds-0.0320.00351 -9.15 0    
accommodates0.0430.00278 15.4  0    
# check the R squared value
model3_withbed %>% glance()
r.squaredadj.r.squaredsigmastatisticp.valuedflogLikAICBICdeviancedf.residualnobs
0.3980.3980.63858013-1.61e+043.23e+043.24e+046.7e+031686516879

Thanks to these new variables, our Adj. R squared increased to 0.39 which is a lot better than our previous model.

# Remove beds variable and run model3 again

# Add bathrooms, bedrooms, beds and accommodates to model2
model3 <- lm(log((0.01+price_4_nights)) ~ 
               prop_type_simplified + 
               number_of_reviews + 
               review_scores_rating +
               room_type +
               bedrooms +
               bathrooms +
               accommodates,
             data = numeric_listings)

# View the coefficients
m3_sig_factors <- model3 %>% tidy() %>% mutate(p.value=round(p.value,3), estimate=round(estimate,3))
m3_sig_factors
termestimatestd.errorstatisticp.value
(Intercept)6.35 0.0498  127    0    
prop_type_simplifiedCondominium-0.0510.0194  -2.64 0.008
prop_type_simplifiedHouse-0.1560.0192  -8.12 0    
prop_type_simplifiedLoft0.0210.0316  0.68 0.496
prop_type_simplifiedOther0.0190.0215  0.8950.371
number_of_reviews-0.0030.000147-18.9  0    
review_scores_rating0.0020.0005114.44 0    
room_typeHotel room-0.38 0.062   -6.12 0    
room_typePrivate room-0.5190.0126  -41.3  0    
room_typeShared room-0.8610.0373  -23.1  0    
bedrooms0.1980.00797 24.9  0    
bathrooms0.1420.00792 17.9  0    
accommodates0.0330.00255 12.8  0    
# check the R squared value
model3 %>% glance()
r.squaredadj.r.squaredsigmastatisticp.valuedflogLikAICBICdeviancedf.residualnobs
0.3950.3950.632922012-1.62e+043.25e+043.26e+046.75e+031692216935

Removing beds, the adjusted R squared slightly decreases, and so we remove “beds”.

To avoid the overfitting in our model, we want to check for multicollinearity to see if we need to leave out any variables.

vif(model3)
##                          GVIF Df GVIF^(1/(2*Df))
## prop_type_simplified 1.293877  4        1.032730
## number_of_reviews    1.023532  1        1.011697
## review_scores_rating 1.007104  1        1.003546
## room_type            1.527245  3        1.073128
## bedrooms             2.692697  1        1.640944
## bathrooms            2.255957  1        1.501984
## accommodates         1.893678  1        1.376110

We have to take care of the VIF that are above 5 and if it is above 10, it is alarming. as we don’t have any values above 5, we want to keep all the variables so far.

Model 4: superhost

As we want to see if superhosts command a pricing premium we extend our model. We want to add the variable host_is_superhost as an extra explanatory variable after taking control of the other variables.

# add in host_is_superhost
model4 <- lm(log((0.01+price_4_nights)) ~ 
               prop_type_simplified + 
               number_of_reviews + 
               review_scores_rating +
               room_type +
               bedrooms +
               bathrooms +
               accommodates +
               host_is_superhost,
             data = numeric_listings)

# View the coefficients
m4_sig_factors <- model4 %>% tidy() %>% mutate(p.value=round(p.value,3), estimate=round(estimate,3))
m4_sig_factors
termestimatestd.errorstatisticp.value
(Intercept)6.32 0.0501  126    0    
prop_type_simplifiedCondominium-0.0470.0194  -2.42 0.016
prop_type_simplifiedHouse-0.1540.0192  -8.03 0    
prop_type_simplifiedLoft0.0240.0316  0.7670.443
prop_type_simplifiedOther0.0210.0215  0.9910.322
number_of_reviews-0.0030.000157-16.1  0    
review_scores_rating0.0030.0005175.11 0    
room_typeHotel room-0.3720.062   -6.01 0    
room_typePrivate room-0.5190.0126  -41.3  0    
room_typeShared room-0.8630.0373  -23.1  0    
bedrooms0.1980.00796 24.9  0    
bathrooms0.1420.00792 17.9  0    
accommodates0.0330.00255 12.9  0    
host_is_superhostTRUE-0.0610.0127  -4.8  0    
# check the R squared value
model4 %>% glance()
r.squaredadj.r.squaredsigmastatisticp.valuedflogLikAICBICdeviancedf.residualnobs
0.3960.3960.631854013-1.62e+043.25e+043.26e+046.74e+031692116935

The adjusted R squared increases by ~7.883e-3. We notice that host_is_superhost have a p-value of 0.000 which means that it is extremely significant.

Model 5: exact location

A significant proportion of owners advertise the exact location of their listing (is_location_exact == TRUE), while a significant proportion don’t. Now, we will explore whether a listing’s precise location significantly predicts price_4_nights after once more controlling for other variables.

# add in is_location_exact

model5 <- lm(log((0.01+price_4_nights)) ~ 
               prop_type_simplified + 
               number_of_reviews + 
               review_scores_rating +
               room_type +
               bedrooms +
               bathrooms +
               accommodates +
               host_is_superhost +
               is_location_exact,
             data = numeric_listings)

# View the coefficients
m5_sig_factors <- model5 %>% tidy() %>% mutate(p.value=round(p.value,3), estimate=round(estimate,3))
m5_sig_factors
termestimatestd.errorstatisticp.value
(Intercept)6.32 0.0509  124    0    
prop_type_simplifiedCondominium-0.0470.0194  -2.42 0.016
prop_type_simplifiedHouse-0.1550.0192  -8.05 0    
prop_type_simplifiedLoft0.0240.0316  0.7680.442
prop_type_simplifiedOther0.0210.0215  0.9880.323
number_of_reviews-0.0030.000157-16.1  0    
review_scores_rating0.0030.0005175.11 0    
room_typeHotel room-0.3730.062   -6.01 0    
room_typePrivate room-0.5190.0126  -41.3  0    
room_typeShared room-0.8630.0373  -23.1  0    
bedrooms0.1980.00796 24.9  0    
bathrooms0.1420.00792 17.9  0    
accommodates0.0330.00255 12.9  0    
host_is_superhostTRUE-0.0610.0127  -4.77 0    
is_location_exactTRUE-0.0060.011   -0.5610.575
# check the R squared value
model5 %>% glance()
r.squaredadj.r.squaredsigmastatisticp.valuedflogLikAICBICdeviancedf.residualnobs
0.3960.3960.631793014-1.62e+043.25e+043.26e+046.74e+031692016935

We didn’t improve the model so we will leave out `is_location_exact’ as a variable

Model 6: neighbourhood groupings

For all cities, there are 3 variables that relate to neighbourhoods: neighbourhood, neighbourhood_cleansed, and neighbourhood_group_cleansed. There are typically more than 20 neighbourhoods in each city, and it wouldn’t make sense to include them all in our model. [.] The new categorical variable that we create for this will be called neighbourhood_simplified.

We refer to our groupings as zones, which are as follows: - Zone 1: Maracanã - Zone 2: Caju, Centro, Catumbi, Cidade Nova, Lapa, Santa, Teresa - Zone 3: Copacabana, Ipanema, Botafogo, Catete, Cosme Velho - Zone 4: Barra de Tijuca

Rio is commonly broken down into 4 zones: North(1), Central(2), South(3) and West(4) and we have grouped accordingly above.

numeric_listings <- numeric_listings %>%
   mutate(zones = case_when(neighbourhood_cleansed %in% c("Maracanã") ~ "Zone1",
                            neighbourhood_cleansed %in% c("Caju, Centro, Catumbi, Cidade Nova, Lapa, Santa
                                                          Teresa") ~ "Zone2",
                            neighbourhood_cleansed %in% c("Copacabana, Ipanema, Botafogo, Catete, Cosme Velho") ~ "Zone3",
                            neighbourhood_cleansed %in% c("Barra da Tijuca") ~ "Zone4"))

price_neighbourhoods <- numeric_listings %>% 
  group_by(zones) %>% 
  summarise(mean= mean(price_4_nights)) %>% # convert to GBP
  arrange(desc(mean))
## `summarise()` ungrouping output (override with `.groups` argument)
model6 <- lm(log(0.01+price_4_nights) ~ 
               prop_type_simplified + 
               number_of_reviews + 
               review_scores_rating +
               room_type +
               bedrooms +
               bathrooms +
               accommodates +
               host_is_superhost +
               zones,
             data = numeric_listings)

# View the coefficients
m6_sig_factors <- model6 %>% tidy() %>% mutate(p.value=round(p.value,3), estimate=round(estimate,3))
m6_sig_factors
termestimatestd.errorstatisticp.value
(Intercept)5.86 0.16    36.6  0    
prop_type_simplifiedCondominium-0.1170.0367  -3.18 0.001
prop_type_simplifiedHouse-0.1880.051   -3.69 0    
prop_type_simplifiedLoft0.0750.0926  0.8080.419
prop_type_simplifiedOther-0.0890.0455  -1.96 0.05 
number_of_reviews-0.0050.000603-8.27 0    
review_scores_rating0.0050.00152 3    0.003
room_typeHotel room-0.2860.152   -1.88 0.06 
room_typePrivate room-0.4060.0373  -10.9  0    
room_typeShared room-0.7530.103   -7.29 0    
bedrooms0.1570.0244  6.44 0    
bathrooms0.1270.0198  6.39 0    
accommodates0.0510.00857 5.97 0    
host_is_superhostTRUE-0.0920.0352  -2.61 0.009
zonesZone40.4780.0693  6.89 0    
# check the R squared value
model6 %>% glance()
r.squaredadj.r.squaredsigmastatisticp.valuedflogLikAICBICdeviancedf.residualnobs
0.4680.4640.561142.7e-23714-1.53e+033.09e+033.18e+0357018201835

This model is much better compared to model5 before. Specifically, the adjusted R squared now became 0.4640166. Again, we check for multicollinearity to see if the improvement of the model is wrongly due to the interaction between the explanatory variables.

vif(model6)
##                          GVIF Df GVIF^(1/(2*Df))
## prop_type_simplified 1.421483  4        1.044943
## number_of_reviews    1.165108  1        1.079401
## review_scores_rating 1.033025  1        1.016378
## room_type            1.710524  3        1.093591
## bedrooms             3.653351  1        1.911374
## bathrooms            2.873010  1        1.694996
## accommodates         2.733020  1        1.653185
## host_is_superhost    1.170336  1        1.081821
## zones                1.059802  1        1.029467

As no VIF scores are anywhere close to 5 we conclude that we still have no multicollinearity in the model, and hence we move forward with this model.

Model 7: cancellation policy

Next, we want to see how the cancellation policy affects the price_4_nights. This variable is a categorical variable with the possible values “moderate”, “flexible”, “strict_14_with_grace_period”, and “super_strict_60”.

model7 <- lm(log((0.01+price_4_nights)) ~ 
               prop_type_simplified + 
               number_of_reviews + 
               review_scores_rating +
               room_type +
               bedrooms +
               bathrooms +
               accommodates +
               host_is_superhost +
               zones +
               cancellation_policy,
             data = numeric_listings)

m7_sig_factors <- model7 %>% tidy() %>% mutate(p.value=round(p.value,3), estimate=round(estimate,3))
m7_sig_factors
termestimatestd.errorstatisticp.value
(Intercept)5.86 0.16    36.6  0    
prop_type_simplifiedCondominium-0.1150.0368  -3.12 0.002
prop_type_simplifiedHouse-0.1850.0511  -3.63 0    
prop_type_simplifiedLoft0.0750.0926  0.8130.416
prop_type_simplifiedOther-0.0930.0456  -2.03 0.042
number_of_reviews-0.0050.000609-8.04 0    
review_scores_rating0.0050.00152 3.04 0.002
room_typeHotel room-0.2860.152   -1.87 0.061
room_typePrivate room-0.4080.0375  -10.9  0    
room_typeShared room-0.7540.103   -7.29 0    
bedrooms0.1540.0245  6.29 0    
bathrooms0.1290.0199  6.5  0    
accommodates0.0510.00858 5.93 0    
host_is_superhostTRUE-0.0890.0353  -2.51 0.012
zonesZone40.4770.0694  6.88 0    
cancellation_policymoderate-0.0110.0359  -0.3020.762
cancellation_policystrict0.2110.562   0.3750.708
cancellation_policystrict_14_with_grace_period-0.0250.0317  -0.7770.437
cancellation_policysuper_strict_600.3  0.159   1.89 0.059
model7 %>% glance()
r.squaredadj.r.squaredsigmastatisticp.valuedflogLikAICBICdeviancedf.residualnobs
0.4690.4640.5689.32.94e-23418-1.53e+033.1e+033.21e+0356918161835

The adjusted R squared improved again.

Model Comparison

In the end, we are left with two models to choose between: model6 and model5'. To compare the models side by side, we have created a HUX table. In the table below,model6is to the left, andmodel5` is to the right.

huxreg("model6"=model6,
       "model5"=model5,
       statistics = c('#observations' = 'nobs', 
                                'R squared' = 'r.squared', 
                                'Adj. R Squared' = 'adj.r.squared', 
                                'Residual SE' = 'sigma'), 
       bold_signif = 0.05, 
       stars = NULL
) %>% 
  set_caption('Comparison of models')
Comparison of models
model6model5
(Intercept)5.857 6.324 
(0.160)(0.051)
prop_type_simplifiedCondominium-0.117 -0.047 
(0.037)(0.019)
prop_type_simplifiedHouse-0.188 -0.155 
(0.051)(0.019)
prop_type_simplifiedLoft0.075 0.024 
(0.093)(0.032)
prop_type_simplifiedOther-0.089 0.021 
(0.046)(0.022)
number_of_reviews-0.005 -0.003 
(0.001)(0.000)
review_scores_rating0.005 0.003 
(0.002)(0.001)
room_typeHotel room-0.286 -0.373 
(0.152)(0.062)
room_typePrivate room-0.406 -0.519 
(0.037)(0.013)
room_typeShared room-0.753 -0.863 
(0.103)(0.037)
bedrooms0.157 0.198 
(0.024)(0.008)
bathrooms0.127 0.142 
(0.020)(0.008)
accommodates0.051 0.033 
(0.009)(0.003)
host_is_superhostTRUE-0.092 -0.061 
(0.035)(0.013)
zonesZone40.478      
(0.069)     
is_location_exactTRUE     -0.006 
     (0.011)
#observations1835     16935     
R squared0.468 0.396 
Adj. R Squared0.464 0.396 
Residual SE0.560 0.631 

Model Diagnostics

To check the model diagnostics we want to use the autoplot(). In order to test the linearity hypothesis, we want to create a horizontal line in the residues in relation to the adjusted plots. The points should ideally be drawn along the dotted line for the qqplot. For scaled plotting, a horizontal line is desirable with points evenly distributed above and below to satisfy the assumption of homogeneity in a linear regression (i.e. the variance of the residuals is constant).

autoplot(model6) +
     theme_minimal() + 
     labs (title = "Model6 Diagnostic Plots")

Forecasting

To conclude our research, we will use a model to predict the price for an Airbnb listing for 2 guest and 4 nights. We use (https://www.airbnb.fr/rooms/12290017?source_impression_id=p3_1603035619_fqRCUYlBbCh3Zya7) that is an apartment with a private room, has at least 10 reviews, and an average rating of at least 90.

We will include the appropriate 95% interval with our prediction and report this and the point prediction in terms of price_4_nights.

library(stats)
library(datasets)
library(prediction)

# Requirements: private room, >=10 reviews, average rating of >=90.
# Filter on requirements and take a random subsample
set.seed(1234)
subset_requirements <- numeric_listings %>% 
  filter(!is.na(square_feet),
         accommodates>=2,
         number_of_reviews>=10, 
         review_scores_rating >= 90,
         prop_type_simplified == "Apartment",
         room_type == "Private room") %>%
  sample_n(size=20) # These will be the observations we want to predict

training_set <- anti_join(numeric_listings, subset_requirements) # This initializes our model

# Run model on the training set
training_model <- lm(log(price_4_nights) ~ 
                       prop_type_simplified + 
                       review_scores_rating +
                       room_type +
                       bedrooms +
                       zones +
                       square_feet,
                     data = training_set)

# Predict price_4_nights for the subset_requirements
forecast_value <- predict(training_model, newdata = subset_requirements,interval = "confidence")

# Take exponent of fitted values to remove the log transformation
forecast_value <- as.data.frame(forecast_value) %>% 
  mutate(exp_fit = exp(fit),
         exp_lower = exp(lwr), 
         exp_upper = exp(upr),
         real_price = subset_requirements$price_4_nights,
         predic_error = real_price - exp_fit)

mean(forecast_value$real_price) 
## [1] 826.2
average_prediction <- mean(forecast_value$real_price)
CI_upper <- quantile(forecast_value$real_price, 0.975)
CI_lower <- quantile(forecast_value$real_price, 0.025)
CI_lower
##   2.5% 
## 372.35
CI_upper
##  97.5% 
## 1919.5

In this project, we wanted to predict the prices of an Airbnb listing for 2 guests during 4 nights. Our final model was based on the highest R squared value, which was model6.